/*!
 * \copy
 *     Copyright (c)  2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 */

#ifdef HAVE_NEON_AARCH64
#include "arm_arch64_common_macro.S"

// LOAD_LUMA_DATA
//   In : x0 = address of the top-left byte of the block, x1 = row stride
//   Out: v0 = the 16 bytes of the row just above the block (read at x0 - x1)
//        v1 = the 16 bytes of the column just left of the block
//             (lane n <- byte at x0 - 1 + n * x1)
//   Clobbers: x7 (ends on the last row of the left column)
.macro LOAD_LUMA_DATA
    sub     x7, x0, x1
    ld1     {v0.16b}, [x7]      //top
    sub     x7, x0, #1
    // Lanes 0..14: load one byte, then step x7 down one row.
.irp lane, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
    ld1     {v1.b}[\lane], [x7], x1
.endr
    // Last lane: plain load, x7 is not advanced any further.
    ld1     {v1.b}[15], [x7]    //left
.endm

// LOAD_16X4_DATA
//   In : x2 = address of the first of four 16-byte rows, x3 = row stride
//   Out: v22 = {r0.s0, r1.s0, r0.s2, r1.s2}   (rN = row N, sK = 32-bit word K)
//        v23 = {r0.s1, r1.s1, r0.s3, r1.s3}
//        v24 = {r2.s0, r3.s0, r2.s2, r3.s2}
//        v25 = {r2.s1, r3.s1, r2.s3, r3.s3}
//        x2 advanced by four rows
//   Clobbers: v0, v1, v20, v21
.macro LOAD_16X4_DATA
    //Fetch four rows of p_enc data; the regrouped 16X4 bytes land in "v22 ~ v25"
.irp row, v0.16b, v1.16b, v20.16b, v21.16b
    ld1     {\row}, [x2], x3
.endr
    // Pair up the 32-bit words of rows 0/1, then of rows 2/3.
    trn1    v22.4s, v0.4s, v1.4s
    trn2    v23.4s, v0.4s, v1.4s
    trn1    v24.4s, v20.4s, v21.4s
    trn2    v25.4s, v20.4s, v21.4s
.endm

// GET_16X16_V_SATD
//   Runs a two-stage add/sub butterfly over every group of four 16-bit lanes
//   of v4 and v5 (eight lanes each, i.e. four groups in total).
//   In : v4.8h, v5.8h = 16 input values (the callers pass neighbour bytes
//        widened and shifted left by 2)
//   Out: v6.8h = transformed groups taken from v4, v7.8h = transformed groups
//        taken from v5, coefficient order inside each group as noted on the
//        last line of the macro
//   Clobbers: v4, v5
.macro GET_16X16_V_SATD
    // Stage 1: combine 32-bit pairs two lanes apart inside each group.
    trn1    v6.4s, v4.4s, v5.4s
    trn2    v7.4s, v4.4s, v5.4s
    add     v4.8h, v6.8h, v7.8h
    sub     v5.8h, v6.8h, v7.8h
    // Stage 2: combine adjacent 16-bit lanes.
    trn1    v6.8h, v4.8h, v5.8h
    trn2    v7.8h, v4.8h, v5.8h
    add     v4.8h, v6.8h, v7.8h
    sub     v5.8h, v6.8h, v7.8h
    // Gather the four coefficients of each group back into one 64-bit half.
    trn1    v6.4s, v4.4s, v5.4s
    trn2    v7.4s, v4.4s, v5.4s     //{0,1,3,2, 4,5,7,6} v6 {8,9,11,10, 12,13,15,14} v7
.endm

// GET_16X16_H_SATD
//   Same two-stage add/sub butterfly as GET_16X16_V_SATD, but the results go
//   to v16/v17 so that both sets of coefficients can be live at once.
//   In : v4.8h, v5.8h = 16 input values
//   Out: v16.8h = transformed groups taken from v4, v17.8h = transformed
//        groups taken from v5, coefficient order as noted on the last line
//   Clobbers: v4, v5
.macro GET_16X16_H_SATD
    // Stage 1: combine 32-bit pairs two lanes apart inside each group.
    trn1    v16.4s, v4.4s, v5.4s
    trn2    v17.4s, v4.4s, v5.4s
    add     v4.8h, v16.8h, v17.8h
    sub     v5.8h, v16.8h, v17.8h
    // Stage 2: combine adjacent 16-bit lanes.
    trn1    v16.8h, v4.8h, v5.8h
    trn2    v17.8h, v4.8h, v5.8h
    add     v4.8h, v16.8h, v17.8h
    sub     v5.8h, v16.8h, v17.8h
    // Gather the four coefficients of each group back into one 64-bit half.
    trn1    v16.4s, v4.4s, v5.4s
    trn2    v17.4s, v4.4s, v5.4s    //{0,1,3,2, 4,5,7,6} v16 {8,9,11,10, 12,13,15,14} v17
.endm

// SELECT_BEST_COST arg0, arg1, arg2
//   Picks the smallest of three 32-bit costs; the callers in this file pass
//   unsigned condition codes (ls/hi or lo/hs).
//   arg0 : W register holding candidate 0 on entry and the winning cost on exit
//   w1   : candidate 1
//   w2   : candidate 2
//   arg1 : condition, evaluated after "cmp w1, arg0", under which candidate 1
//          replaces candidate 0
//   arg2 : complementary condition under which the current best is kept
//   Out  : w7 = index (0, 1 or 2) of the winning candidate
//   Clobbers: w6, w7 and the condition flags, so arg0 must not be w1, w2, w6
//             or w7
.macro SELECT_BEST_COST arg0, arg1, arg2
    // Candidate 1 against candidate 0.
    cmp     w1, \arg0
    csel    \arg0, \arg0, w1, \arg2
    cset    w7, \arg1
    // Candidate 2 against the best so far. mov does not touch the flags, so
    // both csel instructions below still see the result of this cmp.
    cmp     w2, \arg0
    mov     w6, #2
    csel    \arg0, \arg0, w2, \arg2
    csel    w7, w7, w6, \arg2
.endm

// SELECT_BEST_COST_PREFER_HIGHER arg0
//   SELECT_BEST_COST with the unsigned conditions ls/hi: a later candidate
//   replaces the current best when its cost is lower or equal, so on a tie
//   the higher index wins.
.macro SELECT_BEST_COST_PREFER_HIGHER arg0
    SELECT_BEST_COST \arg0, ls, hi
.endm

// SELECT_BEST_COST_PREFER_LOWER arg0
//   SELECT_BEST_COST with the unsigned conditions lo/hs: a later candidate
//   replaces the current best only when its cost is strictly lower, so on a
//   tie the lower index wins.
.macro SELECT_BEST_COST_PREFER_LOWER arg0
    SELECT_BEST_COST \arg0, lo, hs
.endm

// LOAD_CHROMA_DATA arg0, arg1, arg2
//   arg0 : X register with the address of the top-left byte of an 8x8 block
//   arg1 : destination written as an 8-byte vector (for example v0.8b)
//   arg2 : the same destination written as a byte-lane view (for example v0.b)
//   x1   : row stride
//   Out  : lanes 0..7  <- the 8 bytes of the row just above the block
//          lanes 8..15 <- the 8 bytes of the column just left of the block
//   Clobbers: x9 (ends one row past the last row of the block)
.macro LOAD_CHROMA_DATA arg0, arg1, arg2
    sub     x9, \arg0, x1
    ld1     {\arg1}, [x9]      //top neighbours
    sub     x9, \arg0, #1
    //left neighbours: one byte per row, x9 stepped by the stride after each load
.irp lane, 8, 9, 10, 11, 12, 13, 14, 15
    ld1     {\arg2}[\lane], [x9], x1
.endr
.endm

// LOAD_8X4_DATA arg0
//   arg0 : X register pointing at the first of four 8-byte rows, it is
//          post-incremented by x3 after every row
//   Out  : v20 = {row0[0..3], row1[0..3], row0[4..7], row1[4..7]}
//          v21 = {row2[0..3], row3[0..3], row2[4..7], row3[4..7]}
//          so the low 64 bits of v20/v21 hold the left 4x4 block and the
//          high 64 bits hold the right 4x4 block
//   Clobbers: v0, v1, v2
.macro LOAD_8X4_DATA arg0
    //Load the p_enc data and save to "v20 ~ v21"--- 8X4 bytes
    // v0 = {row0, row2}, v1 = {row1, row3}
    ld1     {v0.8b}, [\arg0], x3
    ld1     {v1.8b}, [\arg0], x3
    ld1     {v0.d}[1], [\arg0], x3
    ld1     {v1.d}[1], [\arg0], x3
    // v2 = left halves of rows 0..3, v1 = right halves of rows 0..3
    trn1    v2.4s, v0.4s, v1.4s
    trn2    v1.4s, v0.4s, v1.4s
    // Regroup so that each register holds two rows of both 4x4 blocks.
    trn1    v20.2d, v2.2d, v1.2d
    trn2    v21.2d, v2.2d, v1.2d
.endm

// HDM_TRANSFORM_4X4_L0 arg0 .. arg9
//   Transforms one 4x4 block of source bytes with add/sub butterflies in both
//   directions and accumulates, for three reference coefficient sets, the
//   absolute differences between the transformed block and the reference.
//   arg0 : byte vector holding rows 0 and 1 of the block (4 bytes each)
//   arg1 : byte vector holding rows 2 and 3 of the block
//   arg2 : .4h reference coefficients for the "16x16_v" accumulator
//   arg3 : .4h reference coefficients for the "16x16_h" accumulator
//   arg4 : .4h reference coefficients for the "16x16_dc_both" accumulator
//   arg5 : .4s accumulator for the v case
//   arg6 : .4s accumulator for the h case
//   arg7 : .4s accumulator for the dc case
//   arg8 : name of an all-zero vector register, without arrangement suffix
//   arg9 : "l" to widen the low 8 bytes of arg0/arg1 (uaddl/usubl) or "l2" to
//          widen the high 8 bytes (uaddl2/usubl2)
//   In each case only four transformed coefficients are compared against the
//   reference, the other twelve are accumulated as absolute values (compared
//   against the zero register).
//   Clobbers: v0, v1, v3, v4, v5
.macro HDM_TRANSFORM_4X4_L0 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
    //Do the vertical transform
    uadd\arg9\()   v0.8h, \arg0, \arg1
    usub\arg9\()   v1.8h, \arg0, \arg1
    trn1    v3.2d, v0.2d, v1.2d
    trn2    v1.2d, v0.2d, v1.2d
    add     v4.8h, v3.8h, v1.8h //{0,1,2,3,4,5,6,7}
    sub     v5.8h, v3.8h, v1.8h //{12,13,14,15,8,9,10,11}

    //Do the horizontal transform
    trn1    v0.4s, v4.4s, v5.4s
    trn2    v1.4s, v4.4s, v5.4s
    add     v4.8h, v0.8h, v1.8h
    sub     v5.8h, v0.8h, v1.8h
    trn1    v0.8h, v4.8h, v5.8h
    trn2    v1.8h, v4.8h, v5.8h
    add     v4.8h, v0.8h, v1.8h
    sub     v5.8h, v0.8h, v1.8h

    //16x16_v
    // v0 gathers the four coefficients matched against arg2, everything else
    // (v1 and the high halves of v4/v5) is matched against zero.
    trn1    v0.2s, v4.2s, v5.2s
    trn2    v1.2s, v4.2s, v5.2s
    sabal   \arg5, v0.4h, \arg2
    sabal   \arg5, v1.4h, \arg8\().4h
    sabal2  \arg5, v4.8h, \arg8\().8h
    sabal2  \arg5, v5.8h, \arg8\().8h

    //16x16_h
    // v0 gathers the four coefficients matched against arg3. v4.4s is then
    // overwritten with the absolute sum of the other twelve coefficients so
    // that the dc case below can reuse it.
    ins     v3.d[0], v4.d[1]
    trn1    v0.4h, v4.4h, v3.4h
    trn2    v1.4h, v4.4h, v3.4h
    sabal   \arg6, v0.4h, \arg3
    sabdl   v4.4s, v1.4h, \arg8\().4h
    sabal   v4.4s, v5.4h, \arg8\().4h
    sabal2  v4.4s, v5.8h, \arg8\().8h
    add     \arg6, \arg6, v4.4s

    //16x16_dc_both
    // Same four coefficients as the h case, compared against arg4, plus the
    // shared twelve-coefficient sum kept in v4.4s.
    sabal   \arg7, v0.4h, \arg4
    add     \arg7, \arg7, v4.4s
.endm

// WelsIntra8x8Combined3Sad_AArch64_neon
//   Computes the SAD cost of three predictions (Dc, left, top) for a pair of
//   8x8 blocks (two planes, presumably Cb and Cr) and selects the cheapest.
//   In : x0   = top-left of the first plane block whose neighbours are read
//        w1   = row stride of the neighbour planes
//        x2   = first plane source block (the p_enc data)
//        w3   = row stride of the source planes
//        x4   = address that receives the selected mode index (32-bit store)
//        w5   = penalty, added doubled to the left and top costs
//        x7   = top-left of the second plane block whose neighbours are read
//        [sp] = first stack argument: second plane source block
//   Out: w0   = lowest cost, [x4] = 0 (Dc), 1 (left) or 2 (top),
//               ties go to the higher index
//   Clobbers: x1, x2, x3, x6, x7, x9, x10, x11, v0-v7, v20-v27, v29-v31, flags
WELS_ASM_AARCH64_FUNC_BEGIN WelsIntra8x8Combined3Sad_AArch64_neon
    ldr     x11, [sp, #0]

    // x1 and x3 are used as 64-bit address offsets below. When a 32-bit
    // integer is passed in a register, AAPCS64 leaves bits 63:32 unspecified,
    // so widen the strides explicitly before the first address computation.
    // NOTE(review): assumes the C prototype declares both strides as 32-bit
    // signed ints - confirm against the header.
    sxtw    x1, w1
    sxtw    x3, w3

    // First plane: v0 = {top[0..7], left[0..7]}
    LOAD_CHROMA_DATA x0, v0.8b, v0.b

    // v2.4s = {sum top[0..3], sum top[4..7], sum left[0..3], sum left[4..7]}
    // v3.2s = {top[0..3] + left[0..3], top[4..7] + left[4..7]}
    uaddlp  v1.8h, v0.16b
    uaddlp  v2.4s, v1.8h
    ins     v3.d[0], v2.d[1]
    add     v3.2s, v2.2s, v3.2s
    urshr   v2.4s, v2.4s, #2        // rounded average of 4 neighbours
    urshr   v3.2s, v3.2s, #3        // rounded average of 8 neighbours

    // v20 = Dc row for the upper four rows  {top+left (x4), top only (x4)}
    // v22 = Dc row for the lower four rows  {left only (x4), top+left (x4)}
    dup     v20.8b, v3.b[0]
    dup     v21.8b, v2.b[4]
    dup     v22.8b, v2.b[12]
    dup     v23.8b, v3.b[4]
    ins     v20.s[1], v21.s[0]
    ins     v22.s[1], v23.s[0]

    // Second plane: v4 = {top[0..7], left[0..7]}, Dc rows in v24 and v26
    LOAD_CHROMA_DATA x7, v4.8b, v4.b

    uaddlp  v5.8h, v4.16b
    uaddlp  v6.4s, v5.8h
    ins     v7.d[0], v6.d[1]
    add     v7.2s, v6.2s, v7.2s
    urshr   v6.4s, v6.4s, #2
    urshr   v7.2s, v7.2s, #3

    dup     v24.8b, v7.b[0]
    dup     v25.8b, v6.b[4]
    dup     v26.8b, v6.b[12]
    dup     v27.8b, v7.b[4]
    ins     v24.s[1], v25.s[0]
    ins     v26.s[1], v27.s[0]

    // x9 / x10 walk down the left neighbour column of each plane.
    sub     x9, x0, #1
    sub     x10, x7, #1

    // Row 0 initialises the three accumulators (uabdl), later rows add (uabal).
    ld1     {v3.8b}, [x2], x3
    ld1     {v5.8b}, [x11], x3

    ld1r    {v6.8b}, [x9], x1
    ld1r    {v7.8b}, [x10], x1

    uabdl   v29.8h, v0.8b, v3.8b
    uabal   v29.8h, v4.8b, v5.8b   //top

    uabdl   v30.8h, v6.8b, v3.8b
    uabal   v30.8h, v7.8b, v5.8b   //left

    uabdl   v31.8h, v20.8b, v3.8b
    uabal   v31.8h, v24.8b, v5.8b   //Dc
    // Rows 1..3 still use the upper Dc rows v20/v24.
.rept 3
    ld1     {v3.8b}, [x2], x3
    ld1     {v5.8b}, [x11], x3

    ld1r    {v6.8b}, [x9], x1
    ld1r    {v7.8b}, [x10], x1

    uabal   v29.8h, v0.8b, v3.8b
    uabal   v29.8h, v4.8b, v5.8b   //top

    uabal   v30.8h, v6.8b, v3.8b
    uabal   v30.8h, v7.8b, v5.8b   //left

    uabal   v31.8h, v20.8b, v3.8b
    uabal   v31.8h, v24.8b, v5.8b   //Dc
.endr

    // Rows 4..7 switch to the lower Dc rows v22/v26.
.rept 4
    ld1     {v3.8b}, [x2], x3
    ld1     {v5.8b}, [x11], x3

    ld1r    {v6.8b}, [x9], x1
    ld1r    {v7.8b}, [x10], x1

    uabal   v29.8h, v0.8b, v3.8b
    uabal   v29.8h, v4.8b, v5.8b   //top

    uabal   v30.8h, v6.8b, v3.8b
    uabal   v30.8h, v7.8b, v5.8b   //left

    uabal   v31.8h, v22.8b, v3.8b
    uabal   v31.8h, v26.8b, v5.8b   //Dc
.endr

    // Horizontal sums. Every 16-bit lane holds at most 16 * 255, so the
    // signed across-lanes add cannot misread a lane as negative.
    saddlv  s29, v29.8h
    fmov    w2, s29
    add     w2, w2, w5, lsl #1      // candidate 2: top cost + 2 * penalty
    saddlv  s30, v30.8h
    fmov    w1, s30
    add     w1, w1, w5, lsl #1      // candidate 1: left cost + 2 * penalty
    saddlv  s31, v31.8h
    fmov    w0, s31                 // candidate 0: Dc cost, no penalty

    SELECT_BEST_COST_PREFER_HIGHER w0

    str     w7, [x4]
WELS_ASM_AARCH64_FUNC_END

// WelsIntra16x16Combined3Sad_AArch64_neon
//   Computes the SAD cost of three predictions (top, left, Dc) for one 16x16
//   block and selects the cheapest.
//   In : x0 = top-left of the block whose neighbours are read
//        w1 = row stride of the neighbour plane
//        x2 = source block (16 rows of 16 bytes)
//        w3 = row stride of the source plane
//        x4 = address that receives the selected mode index (32-bit store)
//        w5 = penalty, added doubled to the left and Dc costs
//   Out: w0 = lowest cost, [x4] = 0 (top), 1 (left) or 2 (Dc),
//             ties go to the lower index
//   Clobbers: x1, x2, x3, x6, x7, v0-v4, v29-v31, flags
WELS_ASM_AARCH64_FUNC_BEGIN WelsIntra16x16Combined3Sad_AArch64_neon

    // x1 and x3 are used as 64-bit address offsets below. When a 32-bit
    // integer is passed in a register, AAPCS64 leaves bits 63:32 unspecified,
    // so widen the strides explicitly before the first address computation.
    // NOTE(review): assumes the C prototype declares both strides as 32-bit
    // signed ints - confirm against the header.
    sxtw    x1, w1
    sxtw    x3, w3

    LOAD_LUMA_DATA

    // Dc = (sum of the 16 top and 16 left neighbours + 16) >> 5, broadcast
    // to every byte of v2.
    uaddlv    h2, v0.16b
    uaddlv    h3, v1.16b
    add       v2.8h, v2.8h, v3.8h
    uqrshrn   b2, h2, #5
    dup       v2.16b, v2.b[0]   //Dc

    // Row 0 initialises the three accumulators (uabdl / uabal2). x7 walks
    // down the left neighbour column, replicated across v4 for each row.
    sub     x7, x0, #1
    ld1     {v3.16b}, [x2], x3
    ld1r    {v4.16b}, [x7], x1

    uabdl   v29.8h, v0.8b, v3.8b
    uabal2  v29.8h, v0.16b,v3.16b   //top

    uabdl   v30.8h, v4.8b, v3.8b
    uabal2  v30.8h, v4.16b,v3.16b   //left

    uabdl   v31.8h, v2.8b, v3.8b
    uabal2  v31.8h, v2.16b,v3.16b   //Dc
    mov     x6, #15                 // 15 rows remain
sad_intra_16x16_x3_opt_loop0:
    ld1     {v3.16b}, [x2], x3
    ld1r    {v4.16b}, [x7], x1

    uabal   v29.8h, v0.8b, v3.8b
    uabal2  v29.8h, v0.16b,v3.16b   //top

    uabal   v30.8h, v4.8b, v3.8b
    uabal2  v30.8h, v4.16b,v3.16b   //left

    uabal   v31.8h, v2.8b, v3.8b
    uabal2  v31.8h, v2.16b,v3.16b   //Dc
    sub     x6, x6, #1
    cbnz    x6,  sad_intra_16x16_x3_opt_loop0

    // Horizontal sums. Every 16-bit lane holds at most 32 * 255, so the
    // signed across-lanes add cannot misread a lane as negative.
    saddlv  s29, v29.8h
    fmov    w0, s29                 // candidate 0: top cost, no penalty
    saddlv  s30, v30.8h
    fmov    w1, s30
    add     w1, w1, w5, lsl #1      // candidate 1: left cost + 2 * penalty
    saddlv  s31, v31.8h
    fmov    w2, s31
    add     w2, w2, w5, lsl #1      // candidate 2: Dc cost + 2 * penalty

    SELECT_BEST_COST_PREFER_LOWER w0

    str     w7, [x4]
WELS_ASM_AARCH64_FUNC_END

// WelsIntra4x4Combined3Satd_AArch64_neon
//   Computes a transform-domain cost for three predictions of one 4x4 block
//   (V from the top neighbours, H from the left neighbours, DC from both),
//   selects the cheapest and writes that prediction out.
//   In : x0   = top-left of the block whose neighbours are read
//        w1   = row stride of the neighbour plane
//        x2   = source block (4 rows of 4 bytes)
//        w3   = row stride of the source plane
//        x4   = destination for the 16 predicted bytes (4 rows, contiguous)
//        x5   = address that receives the selected mode index (32-bit store)
//        w6   = value added to the DC cost
//        w7   = value added to the H cost
//        [sp] = first stack argument, low 32 bits are added to the V cost
//   Out: w0   = lowest cost, [x5] = 0 (V), 1 (H) or 2 (DC),
//               ties go to the higher index
//   Clobbers: x1, x2, x3, x4, x6-x11, v0-v7, v16, v17, v22, v23, v28-v31, flags
WELS_ASM_AARCH64_FUNC_BEGIN WelsIntra4x4Combined3Satd_AArch64_neon
    // x1 and x3 are used as 64-bit address offsets below. When a 32-bit
    // integer is passed in a register, AAPCS64 leaves bits 63:32 unspecified,
    // so widen the strides explicitly before the first address computation.
    // NOTE(review): assumes the C prototype declares both strides as 32-bit
    // signed ints - confirm against the header.
    sxtw    x1, w1
    sxtw    x3, w3

    // v16 = {top[0..3], left[0..3]} in its low 8 bytes
    sub     x9, x0, x1
    ld1     {v16.s}[0], [x9]      //top
    sub     x9, x0, #1
    ld1     {v16.b}[4], [x9], x1
    ld1     {v16.b}[5], [x9], x1
    ld1     {v16.b}[6], [x9], x1
    ld1     {v16.b}[7], [x9], x1


    // b17 = DC value = (sum of the 8 neighbours + 4) >> 3, kept for the
    // prediction output. v2.4h = {DC << 4, 0, 0, 0} is the DC reference
    // coefficient set (uaddlv zeroes the lanes above h2).
    uaddlv  h2, v16.8b
    uqrshrn b17, h2, #3
    urshr   v2.4h, v2.4h, #3
    shl     v2.4h, v2.4h, #4

    //Calculate the 4x4_v 4x4_h mode SATD and save to "v6, v7"
    ushll   v4.8h, v16.8b, #2
    ins     v5.d[0], v4.d[1]
    trn1    v6.2s, v4.2s, v5.2s
    trn2    v7.2s, v4.2s, v5.2s

    add     v4.4h, v6.4h, v7.4h
    sub     v5.4h, v6.4h, v7.4h
    trn1    v6.4h, v4.4h, v5.4h
    trn2    v7.4h, v4.4h, v5.4h
    add     v4.4h, v6.4h, v7.4h
    sub     v5.4h, v6.4h, v7.4h
    trn1    v6.2s, v4.2s, v5.2s
    trn2    v7.2s, v4.2s, v5.2s     //{0,1,3,2,top} v6 {0,1,3,2,left} v7

    eor     v31.16b, v31.16b, v31.16b  //Save the SATD of DC_BOTH
    eor     v30.16b, v30.16b, v30.16b  //Save the SATD of H
    eor     v29.16b, v29.16b, v29.16b  //Save the SATD of V
    eor     v28.16b, v28.16b, v28.16b  //For zero register

    //Load the p_enc data and save to "v22 ~ v23"--- 4X4 bytes
    // v22 = {row0, row1}, v23 = {row2, row3}
    ld1     {v22.s}[0], [x2], x3
    ld1     {v22.s}[1], [x2], x3
    ld1     {v23.s}[0], [x2], x3
    ld1     {v23.s}[1], [x2], x3

    HDM_TRANSFORM_4X4_L0 v22.8b, v23.8b, v6.4h, v7.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l

    // Each cost is the rounded half of the accumulated absolute differences,
    // summed across lanes, plus the per-mode value supplied by the caller.
    // The stack slot is read as 64 bits but only w11 is used.
    ldr     x11, [sp, #0]
    urshr   v29.4s, v29.4s, #1
    addv    s29, v29.4s
    fmov    w0, s29
    add     w0, w0, w11             // candidate 0: V

    urshr   v30.4s, v30.4s, #1
    addv    s30, v30.4s
    fmov    w1, s30
    add     w1, w1, w7              // candidate 1: H

    urshr   v31.4s, v31.4s, #1
    addv    s31, v31.4s
    fmov    w2, s31
    add     w2, w2, w6              // candidate 2: DC

    // w6 and w7 have been consumed above, the selection macro may clobber them.
    mov     w10, w0
    SELECT_BEST_COST_PREFER_HIGHER w10

    str     w7, [x5]

    // Emit the prediction that matches the winning cost. DC is tested first
    // and H second, which mirrors the tie-break towards the higher index.
    sub     w9, w10, w2
    cbnz    w9, satd_intra_4x4_x3_opt_jump0
    // DC: all 16 bytes are the DC value.
    dup     v0.16b, v17.b[0]
    st1     {v0.16b}, [x4]
    b       satd_intra_4x4_x3_opt_end

satd_intra_4x4_x3_opt_jump0:
    sub     w8, w10, w1
    cbnz    w8, satd_intra_4x4_x3_opt_jump1
    // H: row n is the left neighbour of row n repeated four times. st4 on
    // lane 0 writes v0.s[0], v1.s[0], v2.s[0], v3.s[0] back to back.
    dup     v0.16b, v16.b[4]
    dup     v1.16b, v16.b[5]
    dup     v2.16b, v16.b[6]
    dup     v3.16b, v16.b[7]
    st4     {v0.s,v1.s,v2.s,v3.s}[0], [x4]
    b       satd_intra_4x4_x3_opt_end

satd_intra_4x4_x3_opt_jump1:
    // V: every row is a copy of the four top neighbours.
    st1     {v16.S}[0], [x4], #4
    st1     {v16.S}[0], [x4], #4
    st1     {v16.S}[0], [x4], #4
    st1     {v16.S}[0], [x4]
satd_intra_4x4_x3_opt_end:
    mov     w0, w10

WELS_ASM_AARCH64_FUNC_END

// WelsIntra8x8Combined3Satd_AArch64_neon
//   Computes a transform-domain cost for three predictions (DC, H, V) of a
//   pair of 8x8 blocks (two planes, labelled cb and cr in the comments below)
//   and selects the cheapest. Each plane is processed as four 4x4 blocks.
//   In : x0   = top-left of the first plane block whose neighbours are read
//        w1   = row stride of the neighbour planes
//        x2   = first plane source block (the p_enc data)
//        w3   = row stride of the source planes
//        x4   = address that receives the selected mode index (32-bit store)
//        w5   = penalty, added doubled to the H and V costs
//        x7   = top-left of the second plane block whose neighbours are read
//        [sp] = first stack argument: second plane source block
//   Out: w0   = lowest cost, [x4] = 0 (DC), 1 (H) or 2 (V),
//               ties go to the higher index
//   Clobbers: x1, x2, x3, x6, x7, x9, x11, v0-v7, v16-v31, flags
WELS_ASM_AARCH64_FUNC_BEGIN WelsIntra8x8Combined3Satd_AArch64_neon
    ldr     x11, [sp, #0]

    // x1 and x3 are used as 64-bit address offsets below. When a 32-bit
    // integer is passed in a register, AAPCS64 leaves bits 63:32 unspecified,
    // so widen the strides explicitly before the first address computation.
    // NOTE(review): assumes the C prototype declares both strides as 32-bit
    // signed ints - confirm against the header.
    sxtw    x1, w1
    sxtw    x3, w3

    // v0 = first plane {top[0..7], left[0..7]}, v1 = second plane likewise
    LOAD_CHROMA_DATA x0, v0.8b, v0.b

    LOAD_CHROMA_DATA x7, v1.8b, v1.b

    //Calculate the 8x8_v mode reference coefficients and save to "v6, v7"
    // v6 = first plane top (two groups of four), v7 = second plane top
    ushll   v4.8h, v0.8b, #2
    ushll   v5.8h, v1.8b, #2
    GET_16X16_V_SATD

    //Calculate the 8x8_h mode reference coefficients and save to "v16, v17"
    // v16 = first plane left (two groups of four), v17 = second plane left
    ushll2  v4.8h, v0.16b, #2
    ushll2  v5.8h, v1.16b, #2
    GET_16X16_H_SATD

    // Per-plane neighbour sums:
    // v2.4s / v4.4s = {top[0..3], top[4..7], left[0..3], left[4..7]}
    // v3.2s / v5.2s = {top[0..3] + left[0..3], top[4..7] + left[4..7]}
    uaddlp  v0.8h, v0.16b
    uaddlp  v2.4s, v0.8h
    ins     v3.d[0], v2.d[1]
    add     v3.2s, v2.2s, v3.2s

    uaddlp  v1.8h, v1.16b
    uaddlp  v4.4s, v1.8h
    ins     v5.d[0], v4.d[1]
    add     v5.2s, v4.2s, v5.2s

    // v0 = {cb top[4..7], cr top[4..7], cb left[4..7], cr left[4..7]} sums
    trn2    v0.4s, v2.4s, v4.4s
    urshr   v0.4s, v0.4s, #2
    urshr   v3.2s, v3.2s, #3
    urshr   v5.2s, v5.2s, #3

    // Widening to 64 bits with a shift of 4 leaves each DC value (x16) in
    // 16-bit lane 0 of its half with lanes 1..3 zero, which is the .4h
    // layout the transform macro expects for its DC reference.
    ushll   v22.2d, v0.2s, #4    //{1cb, 1cr}
    ushll2  v23.2d, v0.4s, #4    //{2cb, 2cr}
    ushll   v24.2d, v3.2s, #4   //{0cb, 3cb}
    ushll   v25.2d, v5.2s, #4   //{0cr, 3cr}

    eor     v31.16b, v31.16b, v31.16b  //Save the SATD of DC_BOTH
    eor     v30.16b, v30.16b, v30.16b  //Save the SATD of H
    eor     v29.16b, v29.16b, v29.16b  //Save the SATD of V
    eor     v28.16b, v28.16b, v28.16b  //For zero register

    // Bring the second group of each reference set down to a low half so it
    // can be passed as a .4h operand.
    ins     v18.d[0], v6.d[1]
    ins     v19.d[0], v7.d[1]
    ins     v26.d[0], v16.d[1]
    ins     v27.d[0], v17.d[1]

    // First plane, rows 0..3: left 4x4 block then right 4x4 block
    LOAD_8X4_DATA x2

    HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v6.4h, v16.4h, v24.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v18.4h, v16.4h, v22.4h, v29.4s, v30.4s, v31.4s, v28, l2

    // Second plane, rows 0..3
    LOAD_8X4_DATA x11

    ins     v22.d[0], v22.d[1]      // v22 low half <- 1cr
    HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v7.4h, v17.4h, v25.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v19.4h, v17.4h, v22.4h, v29.4s, v30.4s, v31.4s, v28, l2

    // First plane, rows 4..7
    LOAD_8X4_DATA x2

    ins     v24.d[0], v24.d[1]      // v24 low half <- 3cb
    HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v6.4h, v26.4h, v23.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v18.4h, v26.4h, v24.4h, v29.4s, v30.4s, v31.4s, v28, l2

    // Second plane, rows 4..7
    LOAD_8X4_DATA x11

    ins     v23.d[0], v23.d[1]      // v23 low half <- 2cr
    ins     v25.d[0], v25.d[1]      // v25 low half <- 3cr
    HDM_TRANSFORM_4X4_L0 v20.8b, v21.8b, v7.4h, v27.4h, v23.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v20.16b, v21.16b, v19.4h, v27.4h, v25.4h, v29.4s, v30.4s, v31.4s, v28, l2

    // Each cost is the rounded half of the accumulated absolute differences,
    // summed across lanes.
    urshr   v29.4s, v29.4s, #1
    addv    s29, v29.4s
    fmov    w2, s29
    add     w2, w2, w5, lsl #1      // candidate 2: V cost + 2 * penalty

    urshr   v30.4s, v30.4s, #1
    addv    s30, v30.4s
    fmov    w1, s30
    add     w1, w1, w5, lsl #1      // candidate 1: H cost + 2 * penalty

    urshr   v31.4s, v31.4s, #1
    addv    s31, v31.4s
    fmov    w0, s31                 // candidate 0: DC cost, no penalty

    SELECT_BEST_COST_PREFER_HIGHER w0

    str     w7, [x4]
WELS_ASM_AARCH64_FUNC_END


// WelsIntra16x16Combined3Satd_AArch64_neon
//   Computes a transform-domain cost for three predictions (V, H, DC) of one
//   16x16 block, processed as sixteen 4x4 blocks, and selects the cheapest.
//   In : x0 = top-left of the block whose neighbours are read
//        w1 = row stride of the neighbour plane
//        x2 = source block (16 rows of 16 bytes)
//        w3 = row stride of the source plane
//        x4 = address that receives the selected mode index (32-bit store)
//        w5 = penalty, added doubled to the H and DC costs
//   Out: w0 = lowest cost, [x4] = 0 (V), 1 (H) or 2 (DC),
//             ties go to the lower index
//   Clobbers: x1, x2, x3, x6, x7, v0-v7, v16-v31, flags
WELS_ASM_AARCH64_FUNC_BEGIN WelsIntra16x16Combined3Satd_AArch64_neon
    // x1 and x3 are used as 64-bit address offsets below. When a 32-bit
    // integer is passed in a register, AAPCS64 leaves bits 63:32 unspecified,
    // so widen the strides explicitly before the first address computation.
    // NOTE(review): assumes the C prototype declares both strides as 32-bit
    // signed ints - confirm against the header.
    sxtw    x1, w1
    sxtw    x3, w3

    LOAD_LUMA_DATA

    // v2.4h = {DC << 4, 0, 0, 0} with DC = (sum of 32 neighbours + 16) >> 5.
    // uaddlv zeroes the lanes above h2 / h3, so only lane 0 is non-zero.
    uaddlv  h2, v0.16b
    uaddlv  h3, v1.16b
    add     v2.8h, v2.8h, v3.8h
    urshr   v2.4h, v2.4h, #5
    shl     v2.4h, v2.4h, #4

    //Calculate the 16x16_v mode SATD and save to "v6, v7"
    ushll   v4.8h, v0.8b, #2
    ushll2  v5.8h, v0.16b, #2
    GET_16X16_V_SATD

    //Calculate the 16x16_h mode SATD and save to "v16, v17"
    ushll   v4.8h, v1.8b, #2
    ushll2  v5.8h, v1.16b, #2
    GET_16X16_H_SATD

    eor     v31.16b, v31.16b, v31.16b  //Save the SATD of DC_BOTH
    eor     v30.16b, v30.16b, v30.16b  //Save the SATD of H
    eor     v29.16b, v29.16b, v29.16b  //Save the SATD of V
    eor     v28.16b, v28.16b, v28.16b  //For zero register

    // Bring the second group of each reference set down to a low half so it
    // can be passed as a .4h operand:
    //   top  groups: v6 (cols 0..3), v18 (4..7), v7 (8..11), v19 (12..15)
    //   left groups: v16 (rows 0..3), v26 (4..7), v17 (8..11), v27 (12..15)
    ins     v18.d[0], v6.d[1]
    ins     v19.d[0], v7.d[1]
    ins     v26.d[0], v16.d[1]
    ins     v27.d[0], v17.d[1]

    // Rows 0..3. After LOAD_16X4_DATA the low halves of v22/v24 hold columns
    // 0..3, the high halves columns 8..11, and v23/v25 hold columns 4..7 and
    // 12..15 in the same way.
    LOAD_16X4_DATA

    HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
    HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v16.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2

    // Rows 4..7
    LOAD_16X4_DATA

    HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
    HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v26.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2

    // Rows 8..11
    LOAD_16X4_DATA

    HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
    HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v17.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2

    // Rows 12..15
    LOAD_16X4_DATA

    HDM_TRANSFORM_4X4_L0 v22.8b, v24.8b, v6.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v22.16b, v24.16b, v7.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2
    HDM_TRANSFORM_4X4_L0 v23.8b, v25.8b, v18.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l
    HDM_TRANSFORM_4X4_L0 v23.16b, v25.16b, v19.4h, v27.4h, v2.4h, v29.4s, v30.4s, v31.4s, v28, l2

    // Each cost is the rounded half of the accumulated absolute differences,
    // summed across lanes.
    urshr   v29.4s, v29.4s, #1
    addv    s29, v29.4s
    fmov    w0, s29                 // candidate 0: V cost, no penalty

    urshr   v30.4s, v30.4s, #1
    addv    s30, v30.4s
    fmov    w1, s30
    add     w1, w1, w5, lsl #1      // candidate 1: H cost + 2 * penalty

    urshr   v31.4s, v31.4s, #1
    addv    s31, v31.4s
    fmov    w2, s31
    add     w2, w2, w5, lsl #1      // candidate 2: DC cost + 2 * penalty

    SELECT_BEST_COST_PREFER_LOWER w0

    str     w7, [x4]

WELS_ASM_AARCH64_FUNC_END

#endif
